by: Yannadatch Ouchalern, Last Updated: Oct 22nd, 2021
#import library:
library(tidyverse)
library(ggplot2)
library(sqldf)
library(janitor) #for clean_names()
library(lubridate)
library(dplyr)
library(ggcorrplot)
library(here)
library(kableExtra) #create kable
library(knitr)
library(rmarkdown)
library(ggalluvial) #flow chart
library(plotly) #interactive visualization
library(ggthemes) # visualisation
library(patchwork) # visualisation
library(ggpubr) ##ggarrange
library('timetk') # time series analysis
theme_set(theme_minimal())
sales_train <- read.csv("sales_train_validation.csv")
calendar <- read.csv("calendar.csv",na.strings=c("","NA"))
sales_train %>%
select(seq(1,10,1)) %>%
head() %>%
paged_table()
calendar %>%
head() %>%
paged_table()
Transform our wide data into a long format with a dates column in date format
#create function wide data into a long format
transform_df <- function(df) {
min_date <- date("2011-01-29")
df %>%
select(id, starts_with("d_")) %>%
pivot_longer(starts_with("d_"), names_to = "dates", values_to = "sales") %>%
mutate(dates = as.integer(str_remove(dates,"d_"))) %>%
mutate(dates = min_date + dates - 1) %>%
mutate(id = str_remove(id, "_validation"))
}
#transform data
agg_unit_df <- sales_train %>%
summarise_at(vars(starts_with("d_")), sum) %>%
mutate(id = 1)
agg_unit_df_ts <- transform_df(agg_unit_df)
#visualization
ggplotly(
agg_unit_df_ts %>%
ggplot(aes(x = dates, y = sales)) +
geom_line(col = "deepskyblue2") +
geom_smooth(aes(x = dates, y = sales), fill = 'grey', se = FALSE) +
labs(title = 'All Aggregate unit sales')
)
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
reference : https://www.goodhousekeeping.com/holidays/christmas-ideas/a28966959/walmart-hours-open-on-christmas-day/.
#transform data
monthly_sales_df <- sales_train %>%
group_by(state_id) %>%
summarise_at(vars(starts_with("d_")),sum) %>%
rename(id = state_id)
monthly_sales_df_ts <- transform_df(monthly_sales_df) %>%
mutate(month = month(dates),
year = year(dates)) %>%
group_by(month, year,id) %>%
summarise(sales = sum(sales),
dates = min(dates)) %>%
ungroup() %>%
filter(str_detect(as.character(dates),"..-..-01")) %>%
filter(dates != max(dates))
## `summarise()` has grouped output by 'month', 'year'. You can override using the `.groups` argument.
#visualization
ggplotly(
monthly_sales_df_ts %>%
ggplot(aes(x = dates, y = sales, col = id)) +
geom_line() +
labs(title = 'Sales by state')
)
min_date <- date("2011-01-29")
monthly_sales_cat_df <- sales_train %>%
group_by(state_id, cat_id) %>%
summarise_at(vars(starts_with("d_")),sum) %>%
rename(id = state_id)
monthly_sales_cat_df_ts <- monthly_sales_cat_df %>%
pivot_longer(starts_with("d_"), names_to = "dates", values_to = "sales") %>%
mutate(dates = as.integer(str_remove(dates,"d_"))) %>%
mutate(dates = min_date + dates - 1)
# Quarterly sales visualization
ggplotly(
monthly_sales_cat_df_ts %>%
mutate(quarter = quarter(dates)) %>%
select(-dates) %>%
group_by(id, cat_id, quarter) %>%
summarise(sales = sum(sales)) %>%
ggplot(aes(x = quarter, y = sales, col = cat_id)) +
geom_line() +
geom_point() +
theme(axis.title.y =element_blank()) +
facet_wrap(~id) +
labs(title = 'Quarterly sales by state, category',
xlabel = quarter)
)
## `summarise()` has grouped output by 'id', 'cat_id'. You can override using the `.groups` argument.
# Monthly sales visualization
ggplotly(
monthly_sales_cat_df_ts %>%
mutate(month = month(dates)) %>%
select(-dates) %>%
group_by(id, cat_id, month) %>%
summarise(sales = sum(sales)) %>%
ggplot(aes(x = month, y = sales, col = cat_id)) +
geom_line() +
geom_point() +
theme(axis.title.y =element_blank(),
axis.title.x = element_text(vjust = 1)) +
facet_wrap(~id) +
labs(title = 'Monthly sales by state, category',
xlabel = month)
)
## `summarise()` has grouped output by 'id', 'cat_id'. You can override using the `.groups` argument.
#Weekly sales Visualization
ggplotly(
monthly_sales_cat_df_ts %>%
mutate(wday = wday(dates, label = TRUE, week_start = 1)) %>%
select(-dates) %>%
group_by(id, cat_id, wday) %>%
summarise(sales = sum(sales)) %>%
ggplot(aes(x = wday, y = sales, fill = id)) +
geom_col() +
theme(axis.title.y =element_blank(),
axis.title.x = element_text(vjust = 0)) +
facet_wrap(~id) +
labs(title = 'Weekday sales by state')
)
## `summarise()` has grouped output by 'id', 'cat_id'. You can override using the `.groups` argument.
month_sales_ts <- transform_df(monthly_sales_df) %>%
mutate(id = 1) %>%
filter(!str_detect(as.character(dates), "-12-25"))
loess_all <- predict(loess(month_sales_ts$sales ~ as.integer(month_sales_ts$dates - min(month_sales_ts$dates)) +1, span = 1/2, degree = 1))
P1 <- month_sales_ts %>%
mutate(loess = loess_all) %>%
mutate(sales_rel = sales - loess)
#heatmap visualization
P1 %>%
mutate(wday = wday(dates, label = TRUE, week_start = 1),
month = month(dates,label = TRUE),
year = year(dates)) %>%
group_by(wday, month, year) %>%
summarise(sales = sum(sales_rel)/1e3) %>%
ggplot(aes(month, wday, fill = sales)) +
geom_tile() +
labs(title = 'Weekday Vs Month sales heatmap',
x = "Month of the year", y = "Day of the week", fill = "Relative Sales [1k]") +
scale_fill_distiller(palette = "Spectral")
## `summarise()` has grouped output by 'wday', 'month'. You can override using the `.groups` argument.
calendar <- calendar %>% mutate(date = ymd(date))
month_sales_ts <- monthly_sales_cat_df %>%
pivot_longer(starts_with("d_"), names_to = "dates", values_to = "sales") %>%
mutate(dates = as.integer(str_remove(dates,"d_"))) %>%
mutate(dates = min_date + dates - 1) %>%
mutate(id = 1) %>%
filter(!str_detect(as.character(dates), "-12-25"))
P1 <- month_sales_ts %>%
left_join(calendar %>% select(date, event_type_1), by = c("dates"="date")) %>%
group_by(cat_id) %>%
mutate(loess = predict(loess(sales ~ as.integer(dates - min(dates)) + 1, span = 1/2, degree = 1)),
mean_sales = mean(sales)) %>%
mutate(sales_rel = (sales-loess)/mean_sales) %>%
mutate(is_event = !is.na(event_type_1))
#Sales by special events vs non events by sates visualization
P1 %>%
ggplot(aes(dates, sales/1e3, group = is_event, col = is_event)) +
geom_line(alpha = 0.3) +
geom_line(aes(dates, loess/1e3), col = "darkgrey", linetype = 2) +
geom_smooth(method = "loess", formula = 'y ~ x', span = 2/3, se = FALSE) +
scale_colour_manual(values = c("grey", "deepskyblue2")) +
geom_vline(xintercept = 2, colour ="black") +
facet_wrap(~cat_id, scales="free") +
labs(title = 'Sales by special events vs non events by category',
x = 'dates', y = 'Sales/1k')
#Relative Sales Viz
P2 <- P1 %>%
ggplot(aes(x = cat_id, y = sales_rel, fill = is_event)) +
geom_boxplot() +
coord_flip() +
scale_fill_manual(values = c("grey", "deepskyblue2")) +
labs(y = "Relative Sales",
title = "Relative Sales by category") +
theme(axis.title.y =element_blank(),
legend.position = "none")
P3 <- P1 %>%
filter(is_event == TRUE) %>%
rename(event_type = event_type_1) %>%
group_by(cat_id, event_type) %>%
summarise(sales = median(sales_rel)) %>%
ggplot(aes(cat_id, sales, fill = event_type)) +
geom_col(position = "dodge") +
coord_flip() +
theme(axis.title.y = element_blank(),
axis.text.y = element_blank()) +
labs(title = 'Median Relative Sales',
y = 'median relative sales') +
scale_fill_manual(values = c("grey", "grey", "grey", "orange"))
## `summarise()` has grouped output by 'cat_id'. You can override using the `.groups` argument.
ggarrange(P2,P3, ncol = 2, nrow = 1)
calendar <- calendar %>% mutate(date = ymd(date))
month_sales_ts <- monthly_sales_df %>%
pivot_longer(starts_with("d_"), names_to = "dates", values_to = "sales") %>%
mutate(dates = as.integer(str_remove(dates,"d_"))) %>%
mutate(dates = min_date + dates - 1) %>%
filter(!str_detect(as.character(dates), "-12-25"))
P1 <- month_sales_ts %>%
left_join(calendar %>% select(date, event_type_1), by = c("dates"="date")) %>%
rename(state_id = id ) %>%
group_by(state_id) %>%
mutate(loess = predict(loess(sales ~ as.integer(dates - min(dates)) + 1, span = 1/2, degree = 1)),
mean_sales = mean(sales)) %>%
mutate(sales_rel = (sales-loess)/mean_sales) %>%
mutate(is_event = !is.na(event_type_1)) %>%
ungroup()
#Sales by special events vs non events by sates visualization
P1 %>%
ggplot(aes(dates, sales/1e3, group = is_event, col = is_event)) +
geom_line(alpha = 0.3) +
geom_line(aes(dates, loess/1e3), col = "darkgrey", linetype = 2) +
geom_smooth(method = "loess", formula = 'y ~ x', span = 2/3, se = FALSE) +
scale_colour_manual(values = c("grey", "deepskyblue2")) +
facet_wrap(~state_id) +
labs(title = 'Sales by special events vs non events by sates',
x = 'dates', y = 'Sales/1k')
#Relative Sales Viz
P2 <- P1 %>%
ggplot(aes(x = state_id, y = sales_rel, fill = is_event)) +
geom_boxplot() +
coord_flip() +
scale_fill_manual(values = c("grey", "deepskyblue2")) +
labs(y = "Relative Sales",
title = "Relative Sales by states") +
theme(axis.title.y =element_blank(),
legend.position = "none")
P3 <- P1 %>%
filter(is_event == TRUE) %>%
rename(event_type = event_type_1) %>%
group_by(state_id, event_type) %>%
summarise(sales = median(sales_rel)) %>%
ggplot(aes(state_id, sales, fill = event_type)) +
geom_col(position = "dodge") +
coord_flip() +
theme(axis.title.y = element_blank(),
axis.text.y = element_blank()) +
labs(title = 'Median Relative Sales',
y = 'median relative sales') +
scale_fill_manual(values = c("blue", "grey", "grey", "orange"))
## `summarise()` has grouped output by 'state_id'. You can override using the `.groups` argument.
P2 + P3
The United States federal government provides a nutrition assistance benefit called the Supplement Nutrition Assistance Program (SNAP). SNAP provides low income families and individuals with an Electronic Benefits Transfer debit card to purchase food products. In many states, the monetary benefits are dispersed to people across 10 days of the month and on each of these days 1/10 of the people will receive the benefit on their card. More information about the SNAP program can be found here.
snap_calendar <- calendar %>%
select(date, starts_with('snap')) %>%
pivot_longer(starts_with("snap"), names_to = "state", values_to = "snap") %>%
mutate(state = str_sub(state, 6,7)) %>%
tk_augment_timeseries_signature()
## tk_augment_timeseries_signature(): Using the following .date_var variable: date
P1 <- snap_calendar %>%
filter(!is.na(day)) %>%
filter(year == 2015 & month <=6 & state == 'CA') %>%
ggplot(aes(mweek, fct_rev(wday.lbl), fill = as.logical(snap))) +
geom_tile(colour = "white") +
geom_text(aes(label = day), size = 2) +
scale_fill_manual(values = c("grey", "deepskyblue2")) +
facet_grid(year~month.lbl) +
theme_tufte() +
theme(legend.position = "none", axis.text.x = element_blank(), axis.ticks.x = element_blank(),
axis.text.y = element_text(size = 8)) +
labs(x="", y="", title = "California")
P2 <- snap_calendar %>%
filter(!is.na(day)) %>%
filter(year == 2015 & month <= 6 & state == 'TX') %>%
ggplot(aes(mweek, fct_rev(wday.lbl), fill = as.logical(snap))) +
geom_tile(colour = "white") +
geom_text(aes(label = day), size = 2) +
scale_fill_manual(values = c("grey", "deepskyblue2")) +
facet_grid(year~month.lbl) +
theme_tufte() +
theme(legend.position = "none", axis.text.x = element_blank(), axis.ticks.x = element_blank(),
axis.text.y = element_text(size = 8)) +
labs(x="", y="", title = "Texas")
P3 <- snap_calendar %>%
filter(!is.na(day)) %>%
filter(year == 2015 & month <= 5 & state == 'WI') %>%
ggplot(aes(mweek, fct_rev(wday.lbl), fill = as.logical(snap))) +
geom_tile(colour = "white") +
geom_text(aes(label = day), size = 2) +
scale_fill_manual(values = c("grey", "deepskyblue2")) +
facet_grid(year~month.lbl) +
theme_tufte() +
theme(legend.position = "none", axis.text.x = element_blank(), axis.ticks.x = element_blank(),
axis.text.y = element_text(size = 8)) +
labs(x="", y="", title = "Wisconsin")
P1 / P2 / P3 +
plot_annotation(title = 'SNAP day for each state')
snap_calendar <- calendar %>%
select(date, starts_with("snap")) %>%
pivot_longer(starts_with("snap"), names_to = "state_id", values_to = "snap") %>%
mutate(state_id = str_replace(state_id, "snap_", ""))
df <- transform_df(monthly_sales_df) %>%
rename(state_id = id) %>%
left_join(snap_calendar, by = c("dates" = "date", "state_id")) %>%
filter(!str_detect(as.character(dates), "-12-25")) %>%
mutate(snap = as.logical(snap)) %>%
group_by(state_id) %>%
mutate(loess = predict(loess(sales ~ as.integer(dates - min(dates)) + 1, span = 1/2, degree = 1)),
mean_sales = mean(sales)) %>%
mutate(sales_rel = (sales - loess)/mean_sales) %>%
ungroup()
P1 <- df %>%
ggplot(aes(dates, sales/1e3, group = snap, col = snap)) +
geom_line(aes(dates, loess/1e3), col = "black", linetype = 2) +
geom_line(alpha = 0.3) +
geom_smooth(method = "loess", formula = 'y ~ x', span = 2/3, se = FALSE) +
scale_colour_manual(values = c("grey", "deepskyblue2")) +
facet_wrap(~ state_id) +
theme_hc() +
theme(legend.position = "right") +
labs(x = "", y = "Sales [$1k]", col = "SNAP day", title = "Sales by State on SNAP days vs other")
P2 <- df %>%
group_by(state_id, snap) %>%
summarise(sales = sum(sales),
count = n()) %>%
mutate(sales_daily = sales/count) %>%
add_tally(sales_daily, name = "total") %>%
mutate(percentage = round((sales_daily/total)*100,2)) %>%
ggplot(aes(x = state_id, y = percentage, fill = snap)) +
geom_col(position = "dodge") +
scale_fill_manual(values = c("grey", "deepskyblue2")) +
geom_text(aes(label = sprintf("%.1f %%", percentage)), position = position_dodge(0.9), vjust = 1.2, size = 4) +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
axis.title.x = element_blank(),
axis.text.x.bottom = element_text(size = 10))+
labs(x = "", y = "", title = "Daily Sales Percentage")
## `summarise()` has grouped output by 'state_id'. You can override using the `.groups` argument.
P1 / P2 + plot_layout(guides = 'collect')
snap_calendar <- calendar %>%
select(date, starts_with("snap")) %>%
pivot_longer(starts_with("snap"), names_to = "state_id", values_to = "snap") %>%
mutate(state_id = str_replace(state_id, "snap_", ""))
df <- monthly_sales_cat_df %>%
pivot_longer(starts_with("d_"), names_to = "dates", values_to = "sales") %>%
mutate(dates = as.integer(str_remove(dates,"d_"))) %>%
mutate(dates = min_date + dates - 1) %>%
rename(state_id = id) %>%
left_join(snap_calendar, by = c("dates" = "date", "state_id")) %>%
filter(!str_detect(as.character(dates), "-12-25")) %>%
mutate(snap = as.logical(snap)) %>%
group_by(state_id, cat_id) %>%
mutate(loess = predict(loess(sales ~ as.integer(dates - min(dates)) + 1, span = 1/2, degree = 1)),
mean_sales = mean(sales)) %>%
mutate(sales_rel = (sales - loess)/mean_sales) %>%
ungroup()
P1 <- df %>%
group_by(state_id, cat_id, snap) %>%
summarise(sales = sum(sales),
count = n()) %>%
mutate(sales_daily = sales/count) %>%
add_tally(sales_daily, name = "total") %>%
mutate(percentage = sales_daily/total) %>%
ggplot(aes(x = cat_id, y = percentage, fill = snap)) +
geom_col(position = "dodge") +
facet_wrap(~ state_id, nrow = 1) +
scale_fill_manual(values = c("grey", "deepskyblue2")) +
scale_y_continuous(labels = scales::percent) +
theme(axis.title.x = element_blank(),
axis.text.x.bottom = element_text(size = 6)) +
labs(x = "", y = "", title = "Daily Sales Percentage for SNAP by category")
## `summarise()` has grouped output by 'state_id', 'cat_id'. You can override using the `.groups` argument.
P1
snap_calendar <- calendar %>%
select(date, starts_with('snap')) %>%
pivot_longer(starts_with("snap"), names_to = "state", values_to = "snap") %>%
mutate(state = str_sub(state, 6,7)) %>%
tk_augment_timeseries_signature()
## tk_augment_timeseries_signature(): Using the following .date_var variable: date
P1 <- snap_calendar %>%
filter(!is.na(day)) %>%
filter(year == 2015 & month <=6 & state == 'CA') %>%
ggplot(aes(mweek, fct_rev(wday.lbl), fill = as.logical(snap))) +
geom_tile(colour = "white") +
geom_text(aes(label = day), size = 2) +
scale_fill_manual(values = c("grey", "deepskyblue2")) +
facet_grid(year~month.lbl) +
theme_tufte() +
theme(legend.position = "none", axis.text.x = element_blank(), axis.ticks.x = element_blank(),
axis.text.y = element_text(size = 10)) +
labs(x="", y="")
P2 <- snap_calendar %>%
filter(!is.na(day)) %>%
filter(year == 2015 & month >=7 & state == 'CA') %>%
ggplot(aes(mweek, fct_rev(wday.lbl), fill = as.logical(snap))) +
geom_tile(colour = "white") +
geom_text(aes(label = day), size = 2) +
scale_fill_manual(values = c("grey", "deepskyblue2")) +
facet_grid(year~month.lbl) +
theme_tufte() +
theme(legend.position = "none", axis.text.x = element_blank(), axis.ticks.x = element_blank(),
axis.text.y = element_text(size = 10)) +
labs(x="", y="")
P1 / P2 +
plot_annotation(title = 'SNAP day for California (CA)')
snap_calendar <- calendar %>%
select(date, starts_with('snap')) %>%
pivot_longer(starts_with("snap"), names_to = "state", values_to = "snap") %>%
mutate(state = str_sub(state, 6,7)) %>%
rename(state_id = state)
df <- monthly_sales_cat_df %>%
pivot_longer(starts_with("d_"), names_to = "dates", values_to = "sales") %>%
mutate(dates = as.integer(str_remove(dates,"d_"))) %>%
mutate(dates = min_date + dates - 1) %>%
rename(state_id = id) %>%
left_join(snap_calendar, by = c("dates" = "date", "state_id")) %>%
filter(!str_detect(as.character(dates), "-12-25")) %>%
mutate(snap = as.logical(snap)) %>%
group_by(state_id, cat_id) %>%
mutate(loess = predict(loess(sales ~ as.integer(dates - min(dates)) + 1, span = 1/2, degree = 1)),
mean_sales = mean(sales)) %>%
mutate(sales_rel = (sales - loess)/mean_sales) %>%
ungroup()
P1 <- df %>%
filter(state_id == "CA" & cat_id == "FOODS") %>%
mutate(wday = wday(dates, label = TRUE, week_start = 1),
month = month(dates, label = TRUE),
year = year(dates)) %>%
group_by(wday, month, snap) %>%
summarise(sales = sum(sales_rel)) %>%
pivot_wider(names_from = "snap", values_from = "sales", names_prefix = "snap") %>%
mutate(snap_effect = snapTRUE - snapFALSE) %>%
ggplot(aes(month, wday, fill = snap_effect)) +
geom_tile() +
#labs(x = "Month of the year", y = "Day of the week", fill = "SNAP effect") +
scale_fill_distiller(palette = "Spectral") +
labs(x = "", y = "", fill = "SNAP effect",title = "SNAP impact by weekday & month",
subtitle = "Relative sales of SNAP days - other days. Only FOODS category and state CA.")
## `summarise()` has grouped output by 'wday', 'month'. You can override using the `.groups` argument.
P1
snap_calendar <- calendar %>%
select(date, starts_with('snap')) %>%
pivot_longer(starts_with("snap"), names_to = "state", values_to = "snap") %>%
mutate(state = str_sub(state, 6,7)) %>%
tk_augment_timeseries_signature()
## tk_augment_timeseries_signature(): Using the following .date_var variable: date
P1 <- snap_calendar %>%
filter(!is.na(day)) %>%
filter(year == 2015 & month <=6 & state == 'WI') %>%
ggplot(aes(mweek, fct_rev(wday.lbl), fill = as.logical(snap))) +
geom_tile(colour = "white") +
geom_text(aes(label = day), size = 2) +
scale_fill_manual(values = c("grey", "deepskyblue2")) +
facet_grid(year~month.lbl) +
theme_tufte() +
theme(legend.position = "none", axis.text.x = element_blank(), axis.ticks.x = element_blank(),
axis.text.y = element_text(size = 10)) +
labs(x="", y="")
P2 <- snap_calendar %>%
filter(!is.na(day)) %>%
filter(year == 2015 & month >=7 & state == 'WI') %>%
ggplot(aes(mweek, fct_rev(wday.lbl), fill = as.logical(snap))) +
geom_tile(colour = "white") +
geom_text(aes(label = day), size = 2) +
scale_fill_manual(values = c("grey", "deepskyblue2")) +
facet_grid(year~month.lbl) +
theme_tufte() +
theme(legend.position = "none", axis.text.x = element_blank(), axis.ticks.x = element_blank(),
axis.text.y = element_text(size = 10)) +
labs(x="", y="")
P1 / P2 +
plot_annotation(title = 'SNAP day for Wisconsin (WI)')
snap_calendar <- calendar %>%
select(date, starts_with('snap')) %>%
pivot_longer(starts_with("snap"), names_to = "state", values_to = "snap") %>%
mutate(state = str_sub(state, 6,7)) %>%
rename(state_id = state)
df <- monthly_sales_cat_df %>%
pivot_longer(starts_with("d_"), names_to = "dates", values_to = "sales") %>%
mutate(dates = as.integer(str_remove(dates,"d_"))) %>%
mutate(dates = min_date + dates - 1) %>%
rename(state_id = id) %>%
left_join(snap_calendar, by = c("dates" = "date", "state_id")) %>%
filter(!str_detect(as.character(dates), "-12-25")) %>%
mutate(snap = as.logical(snap)) %>%
group_by(state_id, cat_id) %>%
mutate(loess = predict(loess(sales ~ as.integer(dates - min(dates)) + 1, span = 1/2, degree = 1)),
mean_sales = mean(sales)) %>%
mutate(sales_rel = (sales - loess)/mean_sales) %>%
ungroup()
P1 <- df %>%
filter(state_id == "WI" & cat_id == "FOODS") %>%
mutate(wday = wday(dates, label = TRUE, week_start = 1),
month = month(dates, label = TRUE),
year = year(dates)) %>%
group_by(wday, month, snap) %>%
summarise(sales = sum(sales_rel)) %>%
pivot_wider(names_from = "snap", values_from = "sales", names_prefix = "snap") %>%
mutate(snap_effect = snapTRUE - snapFALSE) %>%
ggplot(aes(month, wday, fill = snap_effect)) +
geom_tile() +
#labs(x = "Month of the year", y = "Day of the week", fill = "SNAP effect") +
scale_fill_distiller(palette = "Spectral") +
labs(x = "", y = "", fill = "SNAP effect",title = "SNAP impact by weekday & month",
subtitle = "Relative sales of SNAP days - other days. Only FOODS category and state WI")
## `summarise()` has grouped output by 'wday', 'month'. You can override using the `.groups` argument.
P1
snap_calendar <- calendar %>%
select(date, starts_with('snap')) %>%
pivot_longer(starts_with("snap"), names_to = "state", values_to = "snap") %>%
mutate(state = str_sub(state, 6,7)) %>%
tk_augment_timeseries_signature()
## tk_augment_timeseries_signature(): Using the following .date_var variable: date
P1 <- snap_calendar %>%
filter(!is.na(day)) %>%
filter(year == 2015 & month <=6 & state == 'TX') %>%
ggplot(aes(mweek, fct_rev(wday.lbl), fill = as.logical(snap))) +
geom_tile(colour = "white") +
geom_text(aes(label = day), size = 2) +
scale_fill_manual(values = c("grey", "deepskyblue2")) +
facet_grid(year~month.lbl) +
theme_tufte() +
theme(legend.position = "none", axis.text.x = element_blank(), axis.ticks.x = element_blank(),
axis.text.y = element_text(size = 10)) +
labs(x="", y="")
P2 <- snap_calendar %>%
filter(!is.na(day)) %>%
filter(year == 2015 & month >=7 & state == 'TX') %>%
ggplot(aes(mweek, fct_rev(wday.lbl), fill = as.logical(snap))) +
geom_tile(colour = "white") +
geom_text(aes(label = day), size = 2) +
scale_fill_manual(values = c("grey", "deepskyblue2")) +
facet_grid(year~month.lbl) +
theme_tufte() +
theme(legend.position = "none", axis.text.x = element_blank(), axis.ticks.x = element_blank(),
axis.text.y = element_text(size = 10)) +
labs(x="", y="")
P1 / P2 +
plot_annotation(title = 'SNAP day for Texas(TX)')
snap_calendar <- calendar %>%
select(date, starts_with('snap')) %>%
pivot_longer(starts_with("snap"), names_to = "state", values_to = "snap") %>%
mutate(state = str_sub(state, 6,7)) %>%
rename(state_id = state)
df <- monthly_sales_cat_df %>%
pivot_longer(starts_with("d_"), names_to = "dates", values_to = "sales") %>%
mutate(dates = as.integer(str_remove(dates,"d_"))) %>%
mutate(dates = min_date + dates - 1) %>%
rename(state_id = id) %>%
left_join(snap_calendar, by = c("dates" = "date", "state_id")) %>%
filter(!str_detect(as.character(dates), "-12-25")) %>%
mutate(snap = as.logical(snap)) %>%
group_by(state_id, cat_id) %>%
mutate(loess = predict(loess(sales ~ as.integer(dates - min(dates)) + 1, span = 1/2, degree = 1)),
mean_sales = mean(sales)) %>%
mutate(sales_rel = (sales - loess)/mean_sales) %>%
ungroup()
P1 <- df %>%
filter(state_id == "TX" & cat_id == "FOODS") %>%
mutate(wday = wday(dates, label = TRUE, week_start = 1),
month = month(dates, label = TRUE),
year = year(dates)) %>%
group_by(wday, month, snap) %>%
summarise(sales = sum(sales_rel)) %>%
pivot_wider(names_from = "snap", values_from = "sales", names_prefix = "snap") %>%
mutate(snap_effect = snapTRUE - snapFALSE) %>%
ggplot(aes(month, wday, fill = snap_effect)) +
geom_tile() +
#labs(x = "Month of the year", y = "Day of the week", fill = "SNAP effect") +
scale_fill_distiller(palette = "Spectral") +
labs(x = "", y = "", fill = "SNAP effect",title = "SNAP impact by weekday & month",
subtitle = "Relative sales of SNAP days - other days. Only FOODS category and state TX")
## `summarise()` has grouped output by 'wday', 'month'. You can override using the `.groups` argument.
P1
min_date <- date("2011-01-29")
event_calendar <- calendar %>%
select(date,event_name_1, event_type_1) %>%
rename(event_type = event_type_1,
event_name = event_name_1)
monthly_sales_cat_df <- sales_train %>%
group_by(state_id, cat_id) %>%
summarise_at(vars(starts_with("d_")),sum) %>%
rename(id = state_id)
monthly_sales_cat_df_ts <- monthly_sales_cat_df %>%
pivot_longer(starts_with("d_"), names_to = "dates", values_to = "sales") %>%
mutate(dates = as.integer(str_remove(dates,"d_"))) %>%
mutate(dates = min_date + dates - 1)
df1 <- monthly_sales_cat_df_ts %>%
left_join(event_calendar, by = c("dates"= "date")) %>%
filter(!str_detect(as.character(dates), "-12-25")) %>%
mutate(year = year(dates),
month = month(dates, label = TRUE)) %>%
filter(year == 2015)
P1 <- df1 %>%
group_by(month, id, event_type) %>%
summarise(count = n()) %>%
drop_na() %>%
filter(id == 'TX') %>%
ungroup() %>%
ggplot(aes(x = month, y = count, fill = event_type)) +
geom_col() +
scale_fill_manual(values = c("grey", "grey", "grey", "orange")) +
labs(title = 'Total number of event by event type of 2015')
## `summarise()` has grouped output by 'month', 'id'. You can override using the `.groups` argument.
P1
df1 %>%
select(dates, event_name, event_type, month) %>%
drop_na() %>%
filter(id == 'TX' & month == c("Feb", "Jun") & event_type == 'Sporting') %>%
arrange(dates) %>%
kable() %>% kable_styling()
## Adding missing grouping variables: `id`
| id | dates | event_name | event_type | month |
|---|---|---|---|---|
| TX | 2015-02-01 | SuperBowl | Sporting | Feb |
| TX | 2015-06-04 | NBAFinalsStart | Sporting | Jun |
| TX | 2015-06-16 | NBAFinalsEnd | Sporting | Jun |
| TX | 2015-06-16 | NBAFinalsEnd | Sporting | Jun |
df2 <- monthly_sales_cat_df %>%
pivot_longer(starts_with("d_"), names_to = "dates", values_to = "sales") %>%
mutate(dates = as.integer(str_remove(dates,"d_"))) %>%
mutate(dates = min_date + dates - 1) %>%
rename(state_id = id) %>%
left_join(event_calendar, by = c("dates"= "date")) %>%
filter(!str_detect(as.character(dates), "-12-25")) %>%
mutate(is_event = !is.na(event_type)) %>%
#filter(event_type == "Sporting" | is.na(event_type)) %>%
group_by(state_id, cat_id) %>%
mutate(loess = predict(loess(sales ~ as.integer(dates - min(dates)) + 1, span = 1/2, degree = 1)),
mean_sales = mean(sales)) %>%
mutate(sales_rel = (sales - loess)/mean_sales) %>%
ungroup()
P2 <- df2 %>%
filter(state_id == "TX" & cat_id == "FOODS") %>%
mutate(wday = wday(dates, label = TRUE, week_start = 1),
month = month(dates, label = TRUE),
year = year(dates)) %>%
group_by(wday, month, is_event) %>%
summarise(sales = sum(sales_rel)) %>%
pivot_wider(names_from = "is_event", values_from = "sales", names_prefix = "is_event") %>%
mutate(event_effect = is_eventTRUE - is_eventFALSE) %>%
ggplot(aes(month, wday, fill = event_effect)) +
geom_tile() +
scale_fill_distiller(palette = "Spectral") +
labs(x = "", y = "", fill = "Event effect",title = "Event impact by weekday & month",
subtitle = "Relative sales of days - other days. Only FOODS category and state TX")
## `summarise()` has grouped output by 'wday', 'month'. You can override using the `.groups` argument.
P2